# Configuration file for the Sphinx documentation builder.
#
# For the full list of built-in configuration values, see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

import os
import re
import sys
import shutil
from pathlib import Path
from data_juicer import __version__ as version

from sphinx import project as sphinx_project

# Full version string for the docs, taken from ``data_juicer.__version__``.
# The import at the top of this file runs before the ``sys.path`` tweak below,
# so ``data_juicer`` must already be importable when this file is loaded.
release = version

# -- Path setup --------------------------------------------------------------
# Put the directory two levels above the current working directory at the
# front of the import path (``os.path.abspath`` resolves the relative path
# against the current working directory).
# NOTE(review): ``apidoc_module_dir`` and ``create_symlinks`` both go three
# levels up to reach the project root, while this goes two -- confirm that
# two levels is what is intended here.
sys.path.insert(0, os.path.abspath("../../"))

# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
project = "data_juicer"
copyright = "2024, Data-Juicer Team"
author = "Data-Juicer Team"

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
# Map each source file extension to the parser that reads it.
source_suffix = {".rst": "restructuredtext", ".md": "markdown"}

# Sphinx extensions loaded for every build.
extensions = [
    "sphinx.ext.autodoc",
    "sphinx.ext.autosummary",
    "sphinx.ext.viewcode",
    "sphinx.ext.napoleon",
    "sphinx.ext.autosectionlabel",
    "myst_parser",
    "sphinx_copybutton",
    "sphinx_multiversion",
    "sphinxcontrib.apidoc",  # TODO: Replace with sphinx.ext.apidoc when sphinx>=8.2
]

# -- Extension configuration ------------------------------------------------
# MyST (Markdown) parser options: heading depth that gets link anchors, and
# the optional syntax extensions to switch on.
myst_heading_anchors = 4
myst_enable_extensions = [
    "linkify",
    "tasklist",
]

# sphinx_multiversion configuration
# Only the tag that matches the current package release is built. To build
# every released tag instead, use:
# smv_tag_whitelist = r"^v\d+\.\d+\.\d+$"
# ``re.escape`` keeps the dots (and any other regex metacharacters) in the
# version string literal, so "v1.2.3" cannot also match e.g. "v1x2x3".
smv_tag_whitelist = rf"^v{re.escape(release)}$"
# Branches to build.
smv_branch_whitelist = r"^main$"
# Refs that count as "released" versions.
smv_released_pattern = r"^refs/tags/v\d+\.\d+\.\d+$"
# Remotes whose refs are considered.
smv_remote_whitelist = r'^origin$'

# apidoc settings (sphinxcontrib.apidoc): the package to generate API stubs
# for, and the directory the generated stub files are written to.
apidoc_module_dir = "../../../data_juicer"
apidoc_output_dir = "./"

# Prefix document path to section labels, otherwise autogenerated labels would
# look like 'heading' rather than 'path/to/file:heading'
autosectionlabel_prefix_document = True
# Let autosummary generate its stub pages during the build.
autosummary_generate = True
# False means a module's ``__all__`` is honoured when autosummary lists members.
autosummary_ignore_module_all = False
# Document members in the order they appear in the source file.
autodoc_member_order = "bysource"

# -- Templates and patterns -------------------------------------------------
# Directories that hold custom templates.
templates_path = ["_templates"]
# Paths ignored when looking for source files. ``find_zh_exclusions`` extends
# this list at build time with language-specific entries.
exclude_patterns = ["build", "demos/process_video_on_ray/data/Note.md"]

# -- Options for HTML output ------------------------------------------------
# The theme to use for HTML and HTML Help pages.
# See the documentation for a list of builtin themes.
html_theme = "furo"
html_title = "data-juicer"

# Sidebar configuration: templates rendered, in this order, on every page
# ("**" matches all documents).
# NOTE(review): "sidebar/bottom_menu.html" is presumably a custom template
# under ``templates_path`` rather than one shipped with the theme -- confirm.
html_sidebars = {
    "**": [
        "sidebar/brand.html",
        "sidebar/search.html",
        "sidebar/scroll-start.html",
        "sidebar/navigation.html",
        "sidebar/scroll-end.html",
        "sidebar/bottom_menu.html",
    ],
}
# Extra stylesheet added to every page.
html_css_files = ["sidebar-menu.css"]
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ["_static"]

# -- Internationalization settings ------------------------------------------
# Default build language. ``find_zh_exclusions`` and ``setup`` switch their
# behaviour when this is "zh_CN".
language = "en"
locale_dirs = ["locale/"]  # directories searched for translation catalogs
gettext_compact = False  # keep a separate message catalog per document

# Languages offered to readers: maps a language code to its display name.
# Exposed to the HTML templates through ``html_context``.
supported_languages = {
    "en": "English",
    "zh_CN": "简体中文",
    # 'ja': '日本語',
}


def get_lang_link(language, pagename, lang_code, non_zh_pages=None, current_version=""):
    """Build the relative URL of the current page in another language.

    Chinese pages carry a ``_ZH`` suffix on their page name; English pages do
    not. The suffix is added or removed so the link points at the matching
    page of the target language.

    Args:
        language: Language code of the build being rendered (e.g. ``"zh_CN"``).
        pagename: Name of the current page, without file extension.
        lang_code: Language code the link should point to.
        non_zh_pages: Iterable of page names that have no ``_ZH`` counterpart;
            links to these keep the unsuffixed name even for Chinese.
            ``None`` (the default) is treated as empty.
        current_version: Version path segment inserted verbatim in front of
            the page name. NOTE(review): no separator is added, so a non-empty
            value presumably has to end with "/" -- confirm against the
            template that calls this.

    Returns:
        A relative ``.html`` link, rooted two levels up when
        ``current_version`` is non-empty and one level up otherwise.
    """
    base_path = "../../" if current_version else "../"

    # Normalise both sides so "a/./b" and "a/b" compare equal.
    norm_non_zh_pages = {os.path.normpath(page) for page in non_zh_pages or ()}
    target_page = pagename

    # Chinese page -> non-Chinese target: drop the "_ZH" suffix.
    if "CN" in language and pagename.endswith("_ZH") and "CN" not in lang_code:
        target_page = pagename[:-3]
    # Unsuffixed page -> Chinese target: add "_ZH" unless no translation exists.
    if "CN" in lang_code and not pagename.endswith("_ZH"):
        if os.path.normpath(pagename) not in norm_non_zh_pages:
            target_page += "_ZH"

    return f"{base_path}{lang_code}/{current_version}{target_page}.html"


# Extra values made available to every HTML template: the language table and
# the helper that builds cross-language links.
html_context = dict(
    supported_languages=supported_languages,
    get_lang_link=get_lang_link,
)


# -- setup configuration ------------------------------------------------
def find_zh_exclusions(app, config):
    """Work out which sources to exclude for the language being built.

    Walks ``app.srcdir`` and looks at every file that is not itself a
    ``_ZH.md`` / ``_ZH.rst`` file:

    * if a sibling ``<name>_ZH<ext>`` exists, the file's path (relative to the
      source directory) is collected as "has a Chinese replacement";
    * otherwise its extension-less relative path is collected as a page
      without a Chinese version.

    For a ``zh_CN`` build the collected replaced files are appended to
    ``config.exclude_patterns``; for any other language the ``_ZH`` files are
    excluded by glob instead. The pages without a Chinese version are merged
    into ``html_context["non_zh_pages"]``.
    """
    src_root = app.srcdir
    pages_without_zh = set()
    replaced_by_zh = []

    for dirpath, _subdirs, filenames in os.walk(src_root):
        for filename in filenames:
            # Translations themselves are never candidates.
            if filename.endswith(("_ZH.md", "_ZH.rst")):
                continue

            stem, suffix = os.path.splitext(filename)
            zh_counterpart = os.path.join(dirpath, f"{stem}_ZH{suffix}")

            if os.path.exists(zh_counterpart):
                replaced_by_zh.append(
                    os.path.normpath(
                        os.path.relpath(os.path.join(dirpath, filename), src_root)
                    )
                )
                continue

            pages_without_zh.add(
                os.path.normpath(
                    os.path.relpath(os.path.join(dirpath, stem), src_root)
                )
            )

    extra_patterns = (
        replaced_by_zh if config.language == "zh_CN" else ["*_ZH*", "**/*_ZH*"]
    )
    config.exclude_patterns.extend(extra_patterns)

    known_pages = app.config.html_context.setdefault("non_zh_pages", set())
    known_pages.update(pages_without_zh)


def create_symlinks(source_dir):
    """Mirror the project's Markdown files into the Sphinx source tree.

    The project root is taken to be three levels above ``source_dir``. Every
    ``*.md`` file below it gets a relative symlink at the same relative
    location under ``source_dir``, except files whose project-relative path
    contains ``outputs`` or ``sphinx_doc``.

    Args:
        source_dir: ``pathlib.Path`` of the Sphinx source directory.
    """
    project_root = source_dir.parent.parent.parent
    exclude_paths = ("outputs", "sphinx_doc")

    for md_file in project_root.rglob("*.md"):
        relative_md = md_file.relative_to(project_root)

        # Match against the project-relative path only, so the location of
        # the checkout itself (e.g. ".../outputs/repo") cannot exclude
        # every file.
        relative_text = str(relative_md)
        if any(fragment in relative_text for fragment in exclude_paths):
            continue

        target = source_dir / relative_md
        target.parent.mkdir(parents=True, exist_ok=True)

        # A dangling link makes exists() return False, and symlink_to() would
        # then fail with FileExistsError; drop the stale link first.
        if target.is_symlink() and not target.exists():
            target.unlink()

        if not target.exists():
            target.symlink_to(os.path.relpath(md_file, target.parent))


def update_metadata_docnames(app, config):
    """Give every version in ``smv_metadata`` the docnames found for "main".

    The documents are discovered in the ``sourcedir`` recorded for the
    ``"main"`` entry, using the configured source suffixes, and the resulting
    list is stored as ``docnames`` on every version entry.

    Nothing is changed (a message is printed instead) when ``smv_metadata``
    is not available, or when it has no ``"main"`` entry with a source
    directory -- for example when the metadata is empty.
    """
    if not hasattr(app.config, "smv_metadata"):
        print(
            "smv_metadata not found in app.config.  sphinx_multiversion likely not initialized yet."
        )
        return

    metadata = app.config.smv_metadata
    main_entry = metadata.get("main") if metadata else None
    main_sourcedir = main_entry.get("sourcedir") if main_entry else None
    if not main_sourcedir:
        # Without a "main" source directory there is nothing to discover from.
        print("No 'main' entry with a sourcedir in smv_metadata; docnames left unchanged.")
        return

    main_project = sphinx_project.Project(main_sourcedir, config.source_suffix)
    updated_docnames = list(main_project.discover())
    for version_entry in metadata.values():
        version_entry["docnames"] = updated_docnames

def versions_a_lt_or_eq_to_b(version_a, version_b):
    """Return whether ``version_a`` is lower than or equal to ``version_b``.

    Both arguments are expected in the form ``vMAJOR.MINOR.PATCH`` and are
    compared numerically, component by component.

    Args:
        version_a: Version tag to test. Anything not in the expected form
            (a branch name, for instance) yields ``False``.
        version_b: Version tag to compare against.

    Returns:
        ``True`` if ``version_a`` <= ``version_b``, otherwise ``False``.

    Raises:
        ValueError: If ``version_a`` is well-formed but ``version_b`` is not.
    """

    def parse(tag):
        # fullmatch rejects trailing junk, including a trailing newline that
        # "^...$" with re.match would let through.
        found = re.fullmatch(r"v(\d+)\.(\d+)\.(\d+)", tag)
        return tuple(int(part) for part in found.groups()) if found else None

    parts_a = parse(version_a)
    if parts_a is None:
        return False

    parts_b = parse(version_b)
    if parts_b is None:
        raise ValueError(f"Invalid version format for version_b: {version_b}")

    # Tuples compare element by element, i.e. major, then minor, then patch.
    return parts_a <= parts_b

def rebuild_source_dir(app, config):
    """Prepare the source directory once the configuration is available.

    Runs, in order: symlink creation for the Markdown files, the docnames
    refresh in the multiversion metadata, and the language-specific
    exclusion scan.
    """
    create_symlinks(Path(app.srcdir))
    update_metadata_docnames(app, config)
    # Decide which translated / untranslated files this build leaves out.
    find_zh_exclusions(app, config)


def skip(app, what, name, obj, would_skip, options):
    """Decide whether autodoc skips a member.

    ``__init__`` is always documented; for every other member the decision
    passed in as ``would_skip`` is returned unchanged.
    """
    return False if name == "__init__" else would_skip


def process_doc_links(app, docname, source):
    """Point Markdown links to non-document files at the GitHub repository.

    A link ``[text](path)`` is rewritten when ``path`` does not start with
    ``http`` or ``#`` and does not end in ``.md`` or ``.rst``. The path is
    resolved relative to the directory of ``docname`` and appended to the
    repository's ``blob/main/`` URL. ``source[0]`` is updated in place and
    also returned.
    """
    repo_base = "https://github.com/modelscope/data-juicer/blob/main/"
    link_regex = re.compile(r"\[([^\]]+)\]\((?!http|#)([^)]*(?<!\.md)(?<!\.rst))\)")

    def to_repo_link(found):
        label, relative_target = found.groups()
        doc_dir = os.path.dirname(docname)
        resolved = os.path.normpath(os.path.join(doc_dir, relative_target))
        return "[" + label + "](" + repo_base + resolved + ")"

    rewritten = link_regex.sub(to_repo_link, source[0])
    source[0] = rewritten
    return source[0]


def process_tutorial(app, docname, source):
    """Strip README-only navigation from a document while it is being read.

    Three substitutions are applied to ``source[0]``, using the Chinese or
    English wording depending on ``app.config.language``:

    * the "overview" bullet that links back to the README is removed;
    * the link to the hosted API reference is pointed at ``api.rst``;
    * the "English | 中文" language-switch line is removed.

    ``source[0]`` is updated in place and also returned.
    """
    if app.config.language == "zh_CN":
        overview_placeholder, api_placeholder = "- [DJ概览](../../README_ZH.md)", "API参考"
    else:
        overview_placeholder, api_placeholder = "- [Overview of DJ](../../README.md)", "API references"

    hosted_api_link = f"[{api_placeholder}](https://modelscope.github.io/data-juicer/)"
    local_api_link = f"[{api_placeholder}](api.rst)"

    text = source[0].replace(overview_placeholder, "")
    text = text.replace(hosted_api_link, local_api_link)

    # Matches a language-switch line in either order (English first or
    # Chinese first).
    pattern = r"(?i)\nen[A-Za-z\s]{0,12}\|\s*\[\u4e2d\u6587[\u4e00-\u9fa5\s]{0,12}\]\([^)]+\.md\)|\n\u4e2d\u6587[\u4e00-\u9fa5\s]{0,12}\|\s*\[en[A-Za-z\s]{0,12}\]\([^)]+\.md\)"
    source[0] = re.sub(pattern, "", text)
    return source[0]


def process_read(app, docname, source):
    """Run the source transformations on a document as it is read.

    The tutorial clean-up runs first, then the link rewriting; each result
    is written back to ``source[0]``.
    """
    for transform in (process_tutorial, process_doc_links):
        source[0] = transform(app, docname, source)


def copy_sphinx_doc_to_build(app, config):
    """Copy the documentation project next to the source tree being built.

    The directory that contains this file's own directory is copied into
    ``docs/sphinx_doc`` under the directory three levels above
    ``app.srcdir``; existing files there are overwritten. The copy is
    best-effort: a failure is reported and the build continues.
    """
    source_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    # app.srcdir may be a plain string on older Sphinx versions; wrap it so
    # the parent navigation works either way (rebuild_source_dir does the same).
    dest_dir = Path(app.srcdir).parent.parent.parent / "docs/sphinx_doc"

    try:
        shutil.copytree(source_dir, dest_dir, dirs_exist_ok=True)
    except OSError as e:
        # shutil.Error is a subclass of OSError, so this covers every
        # copy failure without hiding unrelated programming errors.
        print(f"Error copying documentation: {e}")


def setup(app):
    """Register this project's hooks with the Sphinx application.

    Prints the version currently being built, records the latest version for
    sphinx_multiversion, picks the root document for the active language and
    connects the event handlers.
    """
    cfg = app.config
    current_version = cfg.smv_current_version
    print(f"Current version: {current_version}")
    cfg.smv_latest_version = f"v{release}"

    # Handlers are connected in this order: copy the docs first, then
    # rebuild the source directory.
    for handler in (copy_sphinx_doc_to_build, rebuild_source_dir):
        app.connect("config-inited", handler)

    # The Chinese build starts from its own index page.
    cfg.root_doc = "index" if cfg.language != "zh_CN" else "index_ZH"

    app.connect("source-read", process_read)
    app.connect("autodoc-skip-member", skip)
